{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# _Pre-Requisite: Make sure you have run the notebooks in the `SETUP` and `INGEST` sections._"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r ingest_create_athena_table_parquet_passed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    ingest_create_athena_table_parquet_passed\n",
    "except NameError:\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
    "    print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not convert into Parquet data.\")\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(ingest_create_athena_table_parquet_passed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not ingest_create_athena_table_parquet_passed:\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
    "    print(\"[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not convert into Parquet data.\")\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++\")\n",
    "else:\n",
    "    print(\"[OK]\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize Amazon Customer Reviews Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset Column Descriptions\n",
    "\n",
    "- `marketplace`: 2-letter country code (in this case all \"US\").\n",
    "- `customer_id`: Random identifier that can be used to aggregate reviews written by a single author.\n",
    "- `review_id`: A unique ID for the review.\n",
    "- `product_id`: The Amazon Standard Identification Number (ASIN).  `http://www.amazon.com/dp/<ASIN>` links to the product's detail page.\n",
    "- `product_parent`: The parent of that ASIN.  Multiple ASINs (color or format variations of the same product) can roll up into a single parent.\n",
    "- `product_title`: Title description of the product.\n",
    "- `product_category`: Broad product category that can be used to group reviews (in this case digital videos).\n",
    "- `star_rating`: The review's rating (1 to 5 stars).\n",
    "- `helpful_votes`: Number of helpful votes for the review.\n",
    "- `total_votes`: Number of total votes the review received.\n",
    "- `vine`: Was the review written as part of the [Vine](https://www.amazon.com/gp/vine/help) program?\n",
    "- `verified_purchase`: Was the review from a verified purchase?\n",
    "- `review_headline`: The title of the review itself.\n",
    "- `review_body`: The text of the review.\n",
    "- `review_date`: The date the review was written."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "import boto3\n",
    "\n",
    "sess = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set Athena database & table\n",
    "database_name = \"dsoaws\"\n",
    "table_name = \"amazon_reviews_parquet\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyathena import connect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set S3 staging directory -- this is a temporary directory used for Athena queries\n",
    "s3_staging_dir = \"s3://{0}/athena/staging\".format(bucket)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Set Seaborn Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set_style = \"seaborn-whitegrid\"\n",
    "\n",
    "sns.set(\n",
    "    rc={\n",
    "        \"font.style\": \"normal\",\n",
    "        \"axes.facecolor\": \"white\",\n",
    "        \"grid.color\": \".8\",\n",
    "        \"grid.linestyle\": \"-\",\n",
    "        \"figure.facecolor\": \"white\",\n",
    "        \"figure.titlesize\": 20,\n",
    "        \"text.color\": \"black\",\n",
    "        \"xtick.color\": \"black\",\n",
    "        \"ytick.color\": \"black\",\n",
    "        \"axes.labelcolor\": \"black\",\n",
    "        \"axes.grid\": True,\n",
    "        \"axes.labelsize\": 10,\n",
    "        \"xtick.labelsize\": 10,\n",
    "        \"font.size\": 10,\n",
    "        \"ytick.labelsize\": 10,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Helper Code to Display Values on Bars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_values_barplot(axs, space):\n",
    "    def _show_on_plot(ax):\n",
    "        for p in ax.patches:\n",
    "            _x = p.get_x() + p.get_width() + float(space)\n",
    "            _y = p.get_y() + p.get_height()\n",
    "            value = round(float(p.get_width()), 2)\n",
    "            ax.text(_x, _y, value, ha=\"left\")\n",
    "\n",
    "    if isinstance(axs, np.ndarray):\n",
    "        for idx, ax in np.ndenumerate(axs):\n",
    "            _show_on_plot(ax)\n",
    "    else:\n",
    "        _show_on_plot(axs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Which Product Categories are Highest Rated by Average Rating?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_category, AVG(star_rating) AS avg_star_rating\n",
    "FROM {}.{} \n",
    "GROUP BY product_category \n",
    "ORDER BY avg_star_rating DESC\n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_sql(statement, conn)\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store number of categories\n",
    "num_categories = df.shape[0]\n",
    "print(num_categories)\n",
    "\n",
    "# Store average star ratings\n",
    "average_star_ratings = df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for a Subset of Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Create plot\n",
    "barplot = sns.barplot(y=\"product_category\", x=\"avg_star_rating\", data=df, saturation=1)\n",
    "\n",
    "if num_categories < 10:\n",
    "    sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
    "\n",
    "# Set title and x-axis ticks\n",
    "plt.title(\"Average Rating by Product Category\")\n",
    "plt.xticks([1, 2, 3, 4, 5], [\"1-Star\", \"2-Star\", \"3-Star\", \"4-Star\", \"5-Star\"])\n",
    "\n",
    "# Helper code to show actual values afters bars\n",
    "show_values_barplot(barplot, 0.1)\n",
    "\n",
    "plt.xlabel(\"Average Rating\")\n",
    "plt.ylabel(\"Product Category\")\n",
    "\n",
    "# Export plot if needed\n",
    "plt.tight_layout()\n",
    "# plt.savefig('avg_ratings_per_category.png', dpi=300)\n",
    "\n",
    "# Show graphic\n",
    "plt.show(barplot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c5-01.png\"  width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Which Product Categories Have the Most Reviews?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_category, COUNT(star_rating) AS count_star_rating \n",
    "FROM {}.{}\n",
    "GROUP BY product_category \n",
    "ORDER BY count_star_rating DESC\n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store counts\n",
    "count_ratings = df[\"count_star_rating\"]\n",
    "\n",
    "# Store max ratings\n",
    "max_ratings = df[\"count_star_rating\"].max()\n",
    "print(max_ratings)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for a Subset of Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create Seaborn barplot\n",
    "barplot = sns.barplot(y=\"product_category\", x=\"count_star_rating\", data=df, saturation=1)\n",
    "\n",
    "if num_categories < 10:\n",
    "    sns.set(rc={\"figure.figsize\": (10.0, 5.0)})\n",
    "\n",
    "# Set title\n",
    "plt.title(\"Number of Ratings per Product Category for Subset of Product Categories\")\n",
    "\n",
    "# Set x-axis ticks to match scale\n",
    "if max_ratings > 200000:\n",
    "    plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], [\"100K\", \"1m\", \"5m\", \"10m\", \"15m\", \"20m\"])\n",
    "    plt.xlim(0, 20000000)\n",
    "elif max_ratings <= 200000:\n",
    "    plt.xticks([50000, 100000, 150000, 200000], [\"50K\", \"100K\", \"150K\", \"200K\"])\n",
    "    plt.xlim(0, 200000)\n",
    "\n",
    "plt.xlabel(\"Number of Ratings\")\n",
    "plt.ylabel(\"Product Category\")\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "# Export plot if needed\n",
    "# plt.savefig('ratings_per_category.png', dpi=300)\n",
    "\n",
    "# Show the barplot\n",
    "plt.show(barplot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c5-02.png\"  width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. When did each product category become available in the Amazon catalog based on the date of the first review?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_category, MIN(review_date) AS first_review_date\n",
    "FROM {}.{}\n",
    "GROUP BY product_category\n",
    "ORDER BY first_review_date \n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert date strings (e.g. 2014-10-18) to datetime\n",
    "import datetime as datetime\n",
    "\n",
    "dates = pd.to_datetime(df[\"first_review_date\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See: https://stackoverflow.com/questions/60761410/how-to-graph-events-on-a-timeline\n",
    "\n",
    "\n",
    "def modify_dataframe(df):\n",
    "    \"\"\" Modify dataframe to include new columns \"\"\"\n",
    "    df[\"year\"] = pd.to_datetime(df[\"first_review_date\"], format=\"%Y-%m-%d\").dt.year\n",
    "    return df\n",
    "\n",
    "\n",
    "def get_x_y(df):\n",
    "    \"\"\" Get X and Y coordinates; return tuple \"\"\"\n",
    "    series = df[\"year\"].value_counts().sort_index()\n",
    "    # new_series = series.reindex(range(1,21)).fillna(0).astype(int)\n",
    "    return series.index, series.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = modify_dataframe(df)\n",
    "print(new_df)\n",
    "\n",
    "X, Y = get_x_y(new_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for a Subset of Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(12, 5))\n",
    "ax = plt.gca()\n",
    "\n",
    "ax.set_title(\"Number Of First Product Category Reviews Per Year for Subset of Categories\")\n",
    "ax.set_xlabel(\"Year\")\n",
    "ax.set_ylabel(\"Count\")\n",
    "\n",
    "ax.plot(X, Y, color=\"black\", linewidth=2, marker=\"o\")\n",
    "ax.fill_between(X, [0] * len(X), Y, facecolor=\"lightblue\")\n",
    "\n",
    "ax.locator_params(integer=True)\n",
    "\n",
    "ax.set_xticks(range(1995, 2016, 1))\n",
    "ax.set_yticks(range(0, max(Y) + 2, 1))\n",
    "\n",
    "plt.xticks(rotation=45)\n",
    "\n",
    "# fig.savefig('first_reviews_per_year.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c4-04.png\"  width=\"90%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. What is the breakdown of ratings (1-5) per product category?  \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_category,\n",
    "         star_rating,\n",
    "         COUNT(*) AS count_reviews\n",
    "FROM {}.{}\n",
    "GROUP BY  product_category, star_rating\n",
    "ORDER BY  product_category ASC, star_rating DESC, count_reviews\n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare for Stacked Percentage Horizontal Bar Plot Showing Proportion of Star Ratings per Product Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create grouped DataFrames by category and by star rating\n",
    "grouped_category = df.groupby(\"product_category\")\n",
    "grouped_star = df.groupby(\"star_rating\")\n",
    "\n",
    "# Create sum of ratings per star rating\n",
    "df_sum = df.groupby([\"star_rating\"]).sum()\n",
    "\n",
    "# Calculate total number of star ratings\n",
    "total = df_sum[\"count_reviews\"].sum()\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dictionary of product categories and array of star rating distribution per category\n",
    "distribution = {}\n",
    "count_reviews_per_star = []\n",
    "i = 0\n",
    "\n",
    "for category, ratings in grouped_category:\n",
    "    count_reviews_per_star = []\n",
    "    for star in ratings[\"star_rating\"]:\n",
    "        count_reviews_per_star.append(ratings.at[i, \"count_reviews\"])\n",
    "        i = i + 1\n",
    "    distribution[category] = count_reviews_per_star\n",
    "\n",
    "# Check if distribution has been created succesfully\n",
    "print(distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if distribution keys are set correctly to product categories\n",
    "print(distribution.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if star rating distributions are set correctly\n",
    "print(distribution.items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort distribution by average rating per category\n",
    "sorted_distribution = {}\n",
    "\n",
    "average_star_ratings.iloc[:, 0]\n",
    "for index, value in average_star_ratings.iloc[:, 0].items():\n",
    "    sorted_distribution[value] = distribution[value]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_sorted_distribution_pct = pd.DataFrame(sorted_distribution).transpose().apply(\n",
    "    lambda num_ratings: num_ratings/sum(num_ratings)*100, axis=1\n",
    ")\n",
    "df_sorted_distribution_pct.columns=['5', '4', '3', '2', '1']\n",
    "df_sorted_distribution_pct"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for a Subset of Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = df_sorted_distribution_pct.index\n",
    "\n",
    "# Plot bars\n",
    "if len(categories) > 10:\n",
    "    plt.figure(figsize=(10,10))\n",
    "else: \n",
    "    plt.figure(figsize=(10,5))\n",
    "\n",
    "df_sorted_distribution_pct.plot(kind=\"barh\", \n",
    "                                stacked=True, \n",
    "                                edgecolor='white',\n",
    "                                width=1.0,\n",
    "                                color=['green', \n",
    "                                       'orange', \n",
    "                                       'blue', \n",
    "                                       'purple', \n",
    "                                       'red'])\n",
    "\n",
    "plt.title(\"Distribution of Reviews Per Rating Per Category\", \n",
    "          fontsize='16')\n",
    "\n",
    "plt.legend(bbox_to_anchor=(1.04,1), \n",
    "           loc=\"upper left\",\n",
    "           labels=['5-Star Ratings', \n",
    "                   '4-Star Ratings', \n",
    "                   '3-Star Ratings', \n",
    "                   '2-Star Ratings', \n",
    "                   '1-Star Ratings'])\n",
    "\n",
    "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n",
    "plt.gca().invert_yaxis()\n",
    "plt.tight_layout()\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c5-04.png\"  width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. How Many Reviews per Star Rating? (5, 4, 3, 2, 1) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT star_rating,\n",
    "         COUNT(*) AS count_reviews\n",
    "FROM dsoaws.amazon_reviews_parquet\n",
    "GROUP BY  star_rating\n",
    "ORDER BY  star_rating DESC, count_reviews \n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following result:\n",
    "\n",
    "<img src=\"img/star_rating_count_all.png\"  width=\"25%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chart = df.plot.bar(\n",
    "    x=\"star_rating\", y=\"count_reviews\", rot=\"0\", figsize=(10, 5), title=\"Review Count by Star Ratings\", legend=False\n",
    ")\n",
    "\n",
    "plt.xlabel(\"Star Rating\")\n",
    "plt.ylabel(\"Review Count\")\n",
    "\n",
    "plt.show(chart)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following result:\n",
    "\n",
    "\n",
    "<img src=\"img/star_rating_count_all_bar_chart.png\"  width=\"60%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. How Did Star Ratings Change Over Time?\n",
    "Is there a drop-off point for certain product categories throughout the year?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Average Star Rating Across All Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT year, ROUND(AVG(star_rating),4) AS avg_rating\n",
    "FROM {}.{}\n",
    "GROUP BY year\n",
    "ORDER BY year\n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"year\"] = pd.to_datetime(df[\"year\"], format=\"%Y\").dt.year"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for a Subset of Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.gcf()\n",
    "fig.set_size_inches(12, 5)\n",
    "\n",
    "fig.suptitle(\"Average Star Rating Over Time (Across Subset of Product Categories)\")\n",
    "\n",
    "ax = plt.gca()\n",
    "# ax = plt.gca().set_xticks(df['year'])\n",
    "ax.locator_params(integer=True)\n",
    "ax.set_xticks(df[\"year\"].unique())\n",
    "\n",
    "df.plot(kind=\"line\", x=\"year\", y=\"avg_rating\", color=\"red\", ax=ax)\n",
    "\n",
    "# plt.xticks(range(1995, 2016, 1))\n",
    "# plt.yticks(range(0,6,1))\n",
    "plt.xlabel(\"Years\")\n",
    "plt.ylabel(\"Average Star Rating\")\n",
    "plt.xticks(rotation=45)\n",
    "\n",
    "# fig.savefig('average-rating.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c4-06.png\"  width=\"90%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Average Star Rating Per Product Categories Across Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_category, year, ROUND(AVG(star_rating), 4) AS avg_rating_category\n",
    "FROM {}.{}\n",
    "GROUP BY product_category, year\n",
    "ORDER BY year \n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_categories(df):\n",
    "    df_categories = df[\"product_category\"].unique()\n",
    "    for category in df_categories:\n",
    "        # print(category)\n",
    "        df_plot = df.loc[df[\"product_category\"] == category]\n",
    "        df_plot.plot(\n",
    "            kind=\"line\",\n",
    "            x=\"year\",\n",
    "            y=\"avg_rating_category\",\n",
    "            c=np.random.rand(\n",
    "                3,\n",
    "            ),\n",
    "            ax=ax,\n",
    "            label=category,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.gcf()\n",
    "fig.set_size_inches(12, 5)\n",
    "\n",
    "fig.suptitle(\"Average Star Rating Over Time Across Subset Of Categories\")\n",
    "\n",
    "ax = plt.gca()\n",
    "\n",
    "ax.locator_params(integer=True)\n",
    "ax.set_xticks(df[\"year\"].unique())\n",
    "\n",
    "plot_categories(df)\n",
    "\n",
    "plt.xlabel(\"Year\")\n",
    "plt.ylabel(\"Average Star Rating\")\n",
    "plt.legend(bbox_to_anchor=(0, -0.15, 1, 0), loc=2, ncol=2, mode=\"expand\", borderaxespad=0)\n",
    "\n",
    "# fig.savefig('average_rating_category_all_data.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for All Product Categories\n",
    "If you ran this same query across all product categories, you would see the following visualization:\n",
    "\n",
    "<img src=\"img/average_rating_category_all_data.png\"  width=\"90%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7. Which Star Ratings (1-5) are Most Helpful?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT star_rating,\n",
    "         AVG(helpful_votes) AS avg_helpful_votes\n",
    "FROM {}.{}\n",
    "GROUP BY  star_rating\n",
    "ORDER BY  star_rating ASC\n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following result:\n",
    "\n",
    "<img src=\"img/star_rating_helpful_all.png\"  width=\"25%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for a Subset of Product Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chart = df.plot.bar(\n",
    "    x=\"star_rating\", y=\"avg_helpful_votes\", rot=\"0\", figsize=(10, 5), title=\"Helpfulness Of Star Ratings\", legend=False\n",
    ")\n",
    "\n",
    "plt.xlabel(\"Star Rating\")\n",
    "plt.ylabel(\"Average Helpful Votes\")\n",
    "\n",
    "# chart.get_figure().savefig('helpful-votes.png', dpi=300)\n",
    "plt.show(chart)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c4-08.png\"  width=\"70%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 8. Which Products have Most Helpful Reviews?  How Long are the Most Helpful Reviews?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT product_title,\n",
    "         helpful_votes,\n",
    "         star_rating,\n",
    "         LENGTH(review_body) AS review_body_length,\n",
    "         SUBSTR(review_body, 1, 100) AS review_body_substr\n",
    "FROM {}.{}\n",
    "ORDER BY helpful_votes DESC LIMIT 10 \n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following result:\n",
    "\n",
    "<img src=\"img/most_helpful_all.png\"  width=\"90%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 9. What is the Ratio of Positive (5, 4) to Negative (3, 2 ,1) Reviews?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT (CAST(positive_review_count AS DOUBLE) / CAST(negative_review_count AS DOUBLE)) AS positive_to_negative_sentiment_ratio\n",
    "FROM (\n",
    "  SELECT count(*) AS positive_review_count\n",
    "  FROM {}.{}\n",
    "  WHERE star_rating >= 4\n",
    "), (\n",
    "  SELECT count(*) AS negative_review_count\n",
    "  FROM {}.{}\n",
    "  WHERE star_rating < 4\n",
    ")\n",
    "\"\"\".format(\n",
    "    database_name, table_name, database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following result:\n",
    "\n",
    "<img src=\"img/ratio_all.png\"  width=\"25%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 10. Which Customers are Abusing the Review System by Repeatedly Reviewing the Same Product?  What Was Their Average Star Rating for Each Product?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT customer_id, product_category, product_title, \n",
    "ROUND(AVG(star_rating),4) AS avg_star_rating, COUNT(*) AS review_count \n",
    "FROM dsoaws.amazon_reviews_parquet \n",
    "GROUP BY customer_id, product_category, product_title \n",
    "HAVING COUNT(*) > 1 \n",
    "ORDER BY review_count DESC\n",
    "LIMIT 5\n",
    "\"\"\".format(\n",
    "    database_name, table_name\n",
    ")\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Result for All Product Categories\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following result:\n",
    "  \n",
    "<img src=\"img/athena-abuse-all.png\"  width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 11. What is the distribution of review lengths (number of words)?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "statement = \"\"\"\n",
    "SELECT CARDINALITY(SPLIT(review_body, ' ')) as num_words\n",
    "FROM dsoaws.amazon_reviews_parquet\n",
    "\"\"\"\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(statement, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "summary = df[\"num_words\"].describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00])\n",
    "summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"num_words\"].plot.hist(xticks=[0, 16, 32, 64, 128, 256], bins=100, range=[0, 256]).axvline(\n",
    "    x=summary[\"80%\"], c=\"red\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Release Resources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%html\n",
    "\n",
    "<p><b>Shutting down your kernel for this notebook to release resources.</b></p>\n",
    "<button class=\"sm-command-button\" data-commandlinker-command=\"kernelmenu:shutdown\" style=\"display:none;\">Shutdown Kernel</button>\n",
    "        \n",
    "<script>\n",
    "try {\n",
    "    els = document.getElementsByClassName(\"sm-command-button\");\n",
    "    els[0].click();\n",
    "}\n",
    "catch(err) {\n",
    "    // NoOp\n",
    "}    \n",
    "</script>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%javascript\n",
    "\n",
    "try {\n",
    "    Jupyter.notebook.save_checkpoint();\n",
    "    Jupyter.notebook.session.delete();\n",
    "}\n",
    "catch(err) {\n",
    "    // NoOp\n",
    "}"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
